In [1]:
%pylab inline
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import decomposition
from sklearn import linear_model
from sklearn import tree
In [2]:
# Load the training data and relabel its final column as 'total_rentals'.
# NOTE(review): the original name of that column is not visible here; it is
# presumably 'count', which would collide with DataFrame.count under attribute
# access -- confirm against train.csv.
train = pd.read_csv('train.csv')
# Build a fresh label list instead of writing into `train.columns.values`:
# that array backs the (immutable) Index and must not be mutated in place.
train.columns = list(train.columns[:-1]) + ['total_rentals']
train.head()
Out[2]:
In [3]:
# Draw each rental series against the row index, one figure per series.
for series_name in ('casual', 'registered', 'total_rentals'):
    plt.plot(train[series_name])
    plt.show()
In [20]:
# Training feature matrix: columns 1..8 of `train` plus calendar fields parsed
# from the `datetime` column, then projected onto 5 principal components.
# `.iloc` replaces the removed `.ix` indexer; `.copy()` makes X an independent
# frame so the inserts below never write into a view of `train`.
X = train.iloc[:, 1:9].copy()

# Parse all timestamps in one vectorised call (same strict format as before).
stamps = pd.DatetimeIndex(
    pd.to_datetime(train.datetime, format='%Y-%m-%d %H:%M:%S'))

# Each field is inserted at position 0, so inserting in this order leaves the
# leading columns as: year, month, day, hour.
for part in ('hour', 'day', 'month', 'year'):
    X.insert(0, part, np.asarray(getattr(stamps, part)))

# `pca5` is fitted here on the training features only.
pca5 = decomposition.PCA(n_components=5)
X_pc = pca5.fit_transform(X)
print(X_pc.shape)

# Scatter of the 2nd against the 5th principal component.
plt.plot(X_pc[:, 1], X_pc[:, 4], '.')
plt.show()
In [21]:
# Targets: the last three columns of `train`, taken in reverse order, so
# column 0 of `y` is the final column ('total_rentals').
# NOTE(review): the other two are presumably 'registered' and 'casual' --
# confirm the column order of train.csv.
# `.iloc` replaces the removed `.ix` indexer (the slice is purely positional).
y = train.iloc[:, -1:-4:-1]
y.head(2)
Out[21]:
In [22]:
# Test feature matrix: same calendar fields as the training features, inserted
# right after the `datetime` column, then projected with the training PCA.
test_set = pd.read_csv('test.csv')

# Parse all timestamps in one vectorised call (same strict format as before).
stamps = pd.DatetimeIndex(
    pd.to_datetime(test_set.datetime, format='%Y-%m-%d %H:%M:%S'))
months, hours = stamps.month, stamps.hour
print('%d %d' % (len(months), len(hours)))

# Each field is inserted at position 1, so inserting in this order leaves the
# columns as: datetime, year, month, day, hour, <original features...>.
for part in ('hour', 'day', 'month', 'year'):
    test_set.insert(1, part, np.asarray(getattr(stamps, part)))
print(test_set.shape)

# Reuse the PCA fitted on the training features (`pca5`) rather than fitting a
# second PCA on the test set: components fitted separately live in a different
# basis, so a model trained on the training components could not be applied to
# them meaningfully. Everything after `datetime` is passed, which is assumed to
# match the column order of the training matrix X -- TODO confirm.
test_set_pc = pca5.transform(test_set.iloc[:, 1:])
print(test_set_pc.shape)

# Scatter of the 2nd against the 5th principal component.
plt.plot(test_set_pc[:, 1], test_set_pc[:, 4], '.')
plt.show()
In [25]:
# Walk-forward decision-tree regression.
# For every (year, month) pair the tree is refitted on all training rows up to
# and including that month, then predicts the test rows of that same month.
# Outputs consumed by later cells:
#   y_tot / y_reg / y_cas : predictions taken from columns 0 / 1 / 2 of `pred`
#                           (same column order as `y`); y_tot additionally
#                           carries a zero for every training row so that it
#                           lines up with `cas` and `reg`
#   cas / reg             : observed training values, zero-padded over the
#                           predicted (test) rows
# random_state is pinned so that Restart & Run All reproduces the same trees.
dt = tree.DecisionTreeRegressor(random_state=0)
cas = reg = y_tot = y_reg = y_cas = np.asarray([])
mean_score = 0

train_year, train_month = X.year.values, X.month.values
test_year, test_month = test_set.year.values, test_set.month.values

for yr in range(2011, 2013):
    for m in range(1, 13):
        # One combined mask replaces the chained boolean indexing, which
        # applied a full-length mask to an already filtered array.
        seen = (train_year < yr) | ((train_year == yr) & (train_month <= m))
        X_ = X_pc[seen]          # X_pc is an ndarray, so no pd.concat / .ix
        y_ = y[seen]
        dt.fit(X_, y_)

        # Predict on all five components of the matching test rows.
        target = (test_year == yr) & (test_month == m)
        pred = dt.predict(test_set_pc[target])

        # Averaged over the 2 * 12 = 24 fits. This is the score on the rows the
        # tree was just fitted on, not a held-out score.
        mean_score += dt.score(X_, y_) / 24

        # Training rows of this month: observed values for cas/reg, zeros as
        # placeholders in y_tot.
        current = (train_year == yr) & (train_month == m)
        y_tot = np.append(y_tot, np.zeros(int(current.sum())))
        cas = np.append(cas, train.casual.values[current])
        reg = np.append(reg, train.registered.values[current])

        # Test rows of this month: predictions, with zero placeholders in
        # cas/reg to keep all plotted series the same length.
        y_tot = np.append(y_tot, pred[:, 0])
        y_reg = np.append(y_reg, pred[:, 1])
        y_cas = np.append(y_cas, pred[:, 2])
        cas = np.append(cas, np.zeros(pred.shape[0]))
        reg = np.append(reg, np.zeros(pred.shape[0]))

print(mean_score)

# Clamp any negative predictions to zero.
y_cas[y_cas < 0] = 0
y_reg[y_reg < 0] = 0
y_tot[y_tot < 0] = 0

plt.plot(y_cas)
plt.plot(y_reg)
plt.plot(y_cas + y_reg)
plt.show()
In [35]:
# Close-up of positions 200..599: predicted totals against the observed
# casual + registered sum.
window = slice(200, 600)
observed_total = cas + reg
plt.plot(y_tot[window], '.--')
plt.plot(observed_total[window], '.--')
plt.show()

# The same two series over their full length.
plt.plot(y_tot, '.')
plt.plot(observed_total, '.')
plt.show()
In [36]:
# Build the submission frame from the sample file, with the final column
# temporarily named 'total_rentals' and filled with the rounded sum of the
# casual and registered predictions.
sample_submission = pd.read_csv('sampleSubmission.csv')
# Assign a fresh label list rather than mutating `columns.values` in place:
# that array backs the Index and may be shared with frames derived from this
# one.
sample_submission.columns = list(sample_submission.columns[:-1]) + ['total_rentals']
print(sample_submission.shape)

# The copy already carries the renamed column, so no second relabel is needed.
my_submission = sample_submission.copy()
# Item assignment is used instead of attribute assignment; it raises if the
# number of predictions differs from the number of rows in the sample file.
my_submission['total_rentals'] = np.round(y_cas + y_reg)
plt.plot(my_submission.total_rentals)
print(my_submission.shape)
my_submission.head()
Out[36]:
In [37]:
# Name the final column 'count' and write the submission file.
# A fresh label list is assigned instead of editing `columns.values` in place;
# the in-place edit could also rename the column of any frame that shares the
# same label buffer (e.g. `sample_submission`).
my_submission.columns = list(my_submission.columns[:-1]) + ['count']
my_submission.to_csv('decision_tree_2.csv', index=False)
In [ ]: